# Read data
# Load the RNAfold summary table. Column names are supplied explicitly via
# `col_names`; all three columns are parsed as character.
rnafold_predicted_MFE <-
  '/Volumes/Mitsu_NGS_2/METTL2A/RNAfold/m3Crnas/summary.tsv' |>
  read_tsv(col_names = c('transcript_id', 'predicted_MFE', 'pair_prob'))
## Rows: 71 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): transcript_id, predicted_MFE, pair_prob
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the RNAfold summary table (one row per transcript_id).
rnafold_predicted_MFE
## # A tibble: 71 × 3
## transcript_id predicted_MFE pair_prob
## <chr> <chr> <chr>
## 1 ENST00000009589.8 .((((((((((..(((((((.((.((((.(((.....((((((...(… .(((((((…
## 2 ENST00000199764.7 ..((((((((((((.(((.((((((.....((((.(((.((((((.(… .{((((((…
## 3 ENST00000202773.14 (((((((((((((....))))))(((((...................… ,{({{{,(…
## 4 ENST00000215754.8 ....(((((.((....)).)))))...((.((((((((((.(.((((… .,,.((((…
## 5 ENST00000229239.10 (((((((((((((((((((.(((..((.(((.((((((((((.(.((… ((((((((…
## 6 ENST00000230050.4 ....((((.((((((((((...(((.((.(.(((((.((..((((((… ,,{,{(((…
## 7 ENST00000233143.6 .....((((....))))((((((((.((((.((........))))))… ...,,{((…
## 8 ENST00000234875.9 .........((((.((..((((((((((..........))).)))))… .{{{{{{,…
## 9 ENST00000243997.8 ...((((((((((((.((((.((....((((.......))))..)).… ...(((((…
## 10 ENST00000254810.8 ((((((.((((((((((.(.((((...((((.((.....((((((((… ,,{{{{.(…
## # ℹ 61 more rows
# Load the DRS m3C site table, expose the k-mer centre as `position`, and
# attach structure annotations through the project helper add_structureinfo().
# The printed result carries predicted_MFE, pair_prob, pos_dotbracket and
# pos_pairprob in addition to the 13 columns read from disk.
# NOTE(review): add_structureinfo() is defined elsewhere; presumably it joins
# the RNAfold table by transcript_id (a join message is emitted) - confirm.
m3C_sites <-
  'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv' |>
  paste_wd() |>
  read_tsv() |>
  dplyr::rename(position = kmer_middle) |>
  add_structureinfo()
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
# Preview the annotated m3C site table.
m3C_sites
## # A tibble: 489 × 17
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCCAC 149 153
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCC 154 158
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCC 155 159
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCA 156 160
## # ℹ 479 more rows
## # ℹ 10 more variables: position <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # predicted_MFE <chr>, pair_prob <chr>, pos_dotbracket <chr>,
## # pos_pairprob <chr>
# Build a per-base table of every cytosine in the transcripts that carry at
# least one m3C site, then attach structure annotations.
#
# Steps:
#   1. Read the Espresso AsPC1 transcript sequences.
#   2. right_join() against the distinct (transcript_id, gene_name, genetype2)
#      combinations in `m3C_sites`, so only m3C-containing transcripts remain.
#   3. calc_base_position() (project helper) yields `base` / `position`
#      columns, as seen in the printed result.
#      NOTE(review): presumably one row per nucleotide - confirm in the helper.
#   4. Keep cytosines only and add structure info via add_structureinfo().
methylated_RNAs_C_positions <-
  'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz' |>
  paste_wd() |>
  read_tsv() |>
  # The join key is spelled out instead of relying on the natural join, so a
  # future shared column (e.g. gene_name) cannot silently change the key.
  right_join(
    m3C_sites |> select(transcript_id, gene_name, genetype2) |> distinct(),
    by = join_by(transcript_id)
  ) |>
  calc_base_position() |>
  filter(base == 'C') |>
  add_structureinfo()
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
# Preview the cytosine-position table for the m3C-containing transcripts.
methylated_RNAs_C_positions
## # A tibble: 24,117 × 10
## transcript_id base transcript_length gene_name genetype2 position
## <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 ENST00000429711.7 C 2094 RPL32 mRNA 3
## 2 ENST00000429711.7 C 2094 RPL32 mRNA 4
## 3 ENST00000429711.7 C 2094 RPL32 mRNA 5
## 4 ENST00000429711.7 C 2094 RPL32 mRNA 9
## 5 ENST00000429711.7 C 2094 RPL32 mRNA 11
## 6 ENST00000429711.7 C 2094 RPL32 mRNA 13
## 7 ENST00000429711.7 C 2094 RPL32 mRNA 14
## 8 ENST00000429711.7 C 2094 RPL32 mRNA 16
## 9 ENST00000429711.7 C 2094 RPL32 mRNA 17
## 10 ENST00000429711.7 C 2094 RPL32 mRNA 20
## # ℹ 24,107 more rows
## # ℹ 4 more variables: predicted_MFE <chr>, pair_prob <chr>,
## # pos_dotbracket <chr>, pos_pairprob <chr>
# Load the pre-computed region table (kmer_region, thickStart/thickEnd, ...)
# for the cytosines of the methylated RNAs.
allC_methylatedRNAs_regioninfo <-
  'Tables/DRS_m3C_sites/Metagene_CDS/allC_methylatedRNAs_regioninfo_2024-06-05.tsv' |>
  paste_wd() |>
  read_tsv()
## Rows: 22334 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, base, kmer_region, gene_name, gene_type, genetype2
## dbl (6): kmer_middle, start, end, thickStart, thickEnd, length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the region-annotated cytosine table.
allC_methylatedRNAs_regioninfo
## # A tibble: 22,334 × 12
## transcript_id base kmer_middle start end thickStart thickEnd length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000429711.7 C 3 0 2094 77 482 2094
## 2 ENST00000429711.7 C 4 0 2094 77 482 2094
## 3 ENST00000429711.7 C 5 0 2094 77 482 2094
## 4 ENST00000429711.7 C 9 0 2094 77 482 2094
## 5 ENST00000429711.7 C 11 0 2094 77 482 2094
## 6 ENST00000429711.7 C 13 0 2094 77 482 2094
## 7 ENST00000429711.7 C 14 0 2094 77 482 2094
## 8 ENST00000429711.7 C 16 0 2094 77 482 2094
## 9 ENST00000429711.7 C 17 0 2094 77 482 2094
## 10 ENST00000429711.7 C 20 0 2094 77 482 2094
## # ℹ 22,324 more rows
## # ℹ 4 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## # genetype2 <chr>
# Load the m3C site table that additionally carries the region columns
# (start, end, thickStart, thickEnd, kmer_region).
DRS_methylated_positions_CDSpos <-
  'Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-06-05.tsv' |>
  paste_wd() |>
  read_tsv()
## Rows: 436 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2,...
## dbl (11): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kme...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the m3C sites with their region annotation.
DRS_methylated_positions_CDSpos
## # A tibble: 436 × 18
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCA 33 37
## 5 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCT 123 127
## 6 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 141 145
## 7 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCG 186 190
## 8 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 205 209
## 9 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCC 260 264
## 10 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCTC 322 326
## # ℹ 426 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>
# Load the GENCODE v43 annotation table, rename `seq_id` to `seqname`, add the
# `genetype2` classification via the project helper add_genetype2(), and keep
# transcript-level records only.
# NOTE(review): the transcript filter runs after add_genetype2() on the full
# table; the order is kept as-is because the helper is not visible here.
gencode_annotation <-
  'Tables/Database/gencode.v43.annotation.tsv' |>
  paste_wd() |>
  read_tsv() |>
  dplyr::rename(seqname = seq_id) |>
  add_genetype2() |>
  filter(primary_tag == 'transcript')
## Rows: 3422892 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl (4): start, end, strand, level
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the standardized Espresso AsPC1 annotation (gene / transcript / exon
# records) from the external volume.
espresso_AsPC1_annotation <-
  '/Volumes/Mitsu_NGS_2/METTL2A/Database/Custom/Espresso_AsPC1/Espresso_AsPC1_annotation_standardized.tsv' |>
  read_tsv()
## Rows: 285554 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): seq_id, source_tag, primary_tag, score, frame, exon_number, gene_i...
## dbl (3): start, end, strand
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Disabled call to read_gtf_transcripts() on the GTF annotation, kept for
# reference; the standardized TSV is the source actually loaded.
#read_gtf_transcripts('/Volumes/Mitsu_NGS_2/METTL2A/Database/Custom/Espresso_AsPC1/Espresso_AsPC1_annotation_geneplus.gtf')
# Preview the Espresso AsPC1 annotation table.
espresso_AsPC1_annotation
## # A tibble: 285,554 × 13
## seq_id source_tag primary_tag start end score strand frame exon_number
## <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 chrM annotated_isof… gene 577 647 . 1 . N/A
## 2 chrM annotated_isof… transcript 577 647 . 1 . N/A
## 3 chrM annotated_isof… exon 577 647 . 1 . 1
## 4 chrM annotated_isof… gene 648 1601 . 1 . N/A
## 5 chrM annotated_isof… transcript 648 1601 . 1 . N/A
## 6 chrM annotated_isof… exon 648 1601 . 1 . 1
## 7 chrM annotated_isof… gene 1671 3229 . 1 . N/A
## 8 chrM annotated_isof… transcript 1671 3229 . 1 . N/A
## 9 chrM annotated_isof… exon 1671 3229 . 1 . 1
## 10 chrM annotated_isof… gene 3307 4262 . 1 . N/A
## # ℹ 285,544 more rows
## # ℹ 4 more variables: gene_id <chr>, ID <chr>, Parent <chr>,
## # transcript_id <chr>
# Per-transcript structuredness for the Espresso AsPC1 transcriptome, flagged
# by m3C status and merged with the GENCODE transcript annotation.
#
# calc_structuredness_Espresso_AsPC1_RNAs() and add_m3CRNA_info() are project
# helpers; the printed result shows they supply MFE, transcript_length,
# structuredness and m3CRNA.
#
# The join key is spelled out instead of relying on the natural join, so a
# column that later becomes shared between both tables cannot silently change
# the key.
#
# Caveat: this is a full join, so GENCODE transcripts without an Espresso
# entry are kept with NA in structuredness / m3CRNA (36,717 input transcripts
# become 252,870 rows). Downstream code must filter these NAs.
structuredness_Espresso_AsPC1_RNAs <-
  calc_structuredness_Espresso_AsPC1_RNAs() |>
  add_m3CRNA_info() |>
  full_join(gencode_annotation, by = join_by(transcript_id))
## Rows: 36717 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, MFE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
# Preview the merged structuredness / annotation table.
structuredness_Espresso_AsPC1_RNAs
## # A tibble: 252,870 × 33
## transcript_id MFE transcript_length structuredness m3CRNA seqname
## <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ENST00000000233.10 -387. 1032 0.375 others chr7
## 2 ENST00000000412.8 -674. 2450 0.275 others chr12
## 3 ENST00000000442.11 -1014. 2274 0.446 others chr11
## 4 ENST00000001008.6 -1110 3715 0.299 others chr12
## 5 ENST00000002125.9 -565. 2184 0.259 others chr2
## 6 ENST00000002165.11 -644. 2385 0.270 others chr6
## 7 ENST00000002501.11 -784. 2056 0.381 others chr16
## 8 ENST00000002596.6 -1885. 7160 0.263 others chr4
## 9 ENST00000003100.13 -803. 3155 0.254 others chr7
## 10 ENST00000003583.12 -750. 2544 0.295 others chr1
## # ℹ 252,860 more rows
## # ℹ 27 more variables: source_tag <chr>, primary_tag <chr>, start <dbl>,
## # end <dbl>, score <chr>, strand <dbl>, frame <chr>, artif_dupl <chr>,
## # ccdsid <chr>, exon_id <chr>, exon_number <chr>, gene_id <chr>,
## # gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## # havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## # Parent <chr>, protein_id <chr>, tag <chr>, transcript_name <chr>, …
# Exploratory boxplot: structuredness of protein-coding mRNA transcripts,
# split by m3C status. Rows lacking a transcript type or an m3CRNA label are
# excluded first.
ggplot(
  filter(
    structuredness_Espresso_AsPC1_RNAs,
    !is.na(transcript_type),
    !is.na(m3CRNA),
    genetype2 == 'mRNA',
    transcript_type == 'protein_coding'
  ),
  aes(x = m3CRNA, y = structuredness)
) +
  geom_boxplot() +
  # geom_density() +
  facet_wrap(~transcript_type)

# Keep only transcripts that have both a transcript type and an m3CRNA label
# and whose transcript type is protein_coding or Mt_rRNA.
structuredness_Espresso_AsPC1_RNAs_filtered <-
  filter(
    structuredness_Espresso_AsPC1_RNAs,
    !is.na(transcript_type),
    !is.na(m3CRNA),
    transcript_type %in% c('protein_coding', 'Mt_rRNA')
  )
# Write the filtered table as a gzip-compressed TSV into `tabledir` using the
# project helper export_tsv().
export_tsv(
  structuredness_Espresso_AsPC1_RNAs_filtered,
  outdir = tabledir,
  compression = 'gz'
)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/RNAfold/structuredness_Espresso_AsPC1_RNAs_filtered_2024-07-29.tsv.gz
## # A tibble: 19,786 × 33
## transcript_id MFE transcript_length structuredness m3CRNA seqname
## <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ENST00000000233.10 -387. 1032 0.375 others chr7
## 2 ENST00000000412.8 -674. 2450 0.275 others chr12
## 3 ENST00000000442.11 -1014. 2274 0.446 others chr11
## 4 ENST00000001008.6 -1110 3715 0.299 others chr12
## 5 ENST00000002125.9 -565. 2184 0.259 others chr2
## 6 ENST00000002165.11 -644. 2385 0.270 others chr6
## 7 ENST00000002501.11 -784. 2056 0.381 others chr16
## 8 ENST00000002596.6 -1885. 7160 0.263 others chr4
## 9 ENST00000003100.13 -803. 3155 0.254 others chr7
## 10 ENST00000003583.12 -750. 2544 0.295 others chr1
## # ℹ 19,776 more rows
## # ℹ 27 more variables: source_tag <chr>, primary_tag <chr>, start <dbl>,
## # end <dbl>, score <chr>, strand <dbl>, frame <chr>, artif_dupl <chr>,
## # ccdsid <chr>, exon_id <chr>, exon_number <chr>, gene_id <chr>,
## # gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## # havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## # Parent <chr>, protein_id <chr>, tag <chr>, transcript_name <chr>, …
# Wilcoxon test of structuredness between m3C status groups, run separately
# within each genetype2 class and using 'others' as the reference group.
# add_y_position() appends the bracket coordinates that stat_pvalue_manual()
# needs for plotting.
#
# `ref.group` is written out in full: the previous `ref = 'others'` only
# worked through R's partial argument matching.
#
# NOTE(review): rows are excluded on genetype2 == 'Mt_rRNA', while the input
# table was selected on transcript_type == 'Mt_rRNA'. Confirm that genetype2
# really uses the label 'Mt_rRNA'; otherwise this filter only drops NA rows.
structuredness_Espresso_AsPC1_RNAs_filtered_wilcox <-
  structuredness_Espresso_AsPC1_RNAs_filtered |>
  filter(genetype2 != 'Mt_rRNA') |>
  group_by(genetype2) |>
  rstatix::wilcox_test(structuredness ~ m3CRNA, ref.group = 'others') |>
  rstatix::add_y_position()
# Sina plot with an overlaid boxplot of structuredness by m3C status, one
# panel per genetype2 class, annotated with the Wilcoxon p-values computed
# above. Axes are flipped, hence `coord.flip = TRUE` for the p-value layer.
# NOTE(review): the two colours are unnamed, so they follow the level order of
# m3CRNA - confirm which group ends up red.
structuredness_Espresso_AsPC1_RNAs_m3C_sinaplot <-
  ggplot(
    structuredness_Espresso_AsPC1_RNAs_filtered,
    aes(x = m3CRNA, y = structuredness, colour = m3CRNA)
  ) +
  ggforce::geom_sina(size = 1) +
  geom_boxplot(width = .5) +
  scale_color_manual(values = c('red', 'gray30')) +
  ggpubr::stat_pvalue_manual(
    data = structuredness_Espresso_AsPC1_RNAs_filtered_wilcox,
    tip.length = 0,
    coord.flip = TRUE
  ) +
  coord_flip() +
  facet_wrap(~genetype2, ncol = 1)
# Save the figure into `figdir` through the project helper.
ggsave_multiple_formats(
  structuredness_Espresso_AsPC1_RNAs_m3C_sinaplot,
  width = 4.5,
  height = 6,
  fontsize = 7,
  outdir = figdir
)

# Empirical CDF of structuredness for mRNA transcripts, coloured by m3C status
# and faceted by transcript type.
#
# The source table comes from a full join with the annotation, so many rows
# have no structuredness value. They are dropped explicitly here; previously
# stat_ecdf() discarded them itself and warned about 141,469 non-finite rows.
# NOTE(review): transcript types that only had such rows no longer get an
# (empty) facet panel.
structuredness_Espresso_AsPC1_RNAs |>
  filter(genetype2 == 'mRNA', is.finite(structuredness)) |>
  ggplot(aes(x = structuredness, colour = m3CRNA)) +
  stat_ecdf() +
  # geom_boxplot() +
  # geom_density() +
  facet_wrap(~transcript_type)
## Warning: Removed 141469 rows containing non-finite values (`stat_ecdf()`).
